*RCRA Nationwide Hedonic Study
*Housing Transaction Data Descriptive Statistics
*Created: 5/20/2020
*Created by: Dennis Guignet
*Last Revised: 05/22/2023
*Last Revised by: Dennis Guignet

********************************************************************************

*This do-file takes the completed transaction dataset of all transactions in 
*	the contiguous US that are within five kilometers of a TSDF under RCRA, 
*	and presents some descriptive stats for graphs and tables.  Most notably, 
*	the initial code below is used to generate Table 2 in the main text. 
*	Subsequent code below is used to generate several tables and figures in 
*	appendices B.2, B.3, F.1, and F.2. 

********************************************************************************
********************************************************************************

*Start from a clean session before changing any limits.
clear all
*Raise the maximum number of variables allowed (needed b/c of factor variables).
*	Issued right after -clear all-, i.e., with no data in memory.
set maxvar 100000
*Have empty cells of factor-variable interactions dropped.
set emptycells drop


********************************************************************************

*Gather home transaction descriptive statistics for Table 2 in final paper. 

*Bring in transactions within 5km of any TSDF, reading only the variables used
*	below. Restricting the variable list at load time (instead of loading every
*	variable and then issuing -keep-) means unused columns are never read into
*	memory. cntTSD750_1500 is read only to build cntTSD0_1500 and is dropped
*	right after.
use rprice acres acres_miss stories stories_miss bathtot bathtot_miss ///
	sqftstrc sqftstrc_miss age age_miss ///
	p_nbdev_2011_200 p_nbdev_2011_500 hwy500m lake500m river250m ///
	cntTSD0_750 cntTSD750_1500 cntTSD0_5000 ///
	cntTSD0_250 cntTSD250_500 cntTSD500_750 cntTSD750_1000 ///
	using "$salesfolder\All_Sales_Final_Cleaned_TSD5k", clear
count		
*Number of TSDFs within 0-1500m is the sum of the two narrower count bins.
gen cntTSD0_1500= cntTSD0_750+cntTSD750_1500
drop cntTSD750_1500
	*Note: cntTSD0_5000 is read directly from the dataset; it is not 
	*	constructed here.

*Descriptive statistics used to generate Table 2 in main text. Note that statistics
*	for variables with missing values are replaced with stats from subsequent "sum"
*	commands presented immediately below. 
sum rprice acres_miss stories_miss bathtot_miss sqftstrc_miss age_miss ///
	p_nbdev_2011_200 p_nbdev_2011_500 hwy500m /*lake500m river250m*/ ///
	cntTSD0_750 cntTSD0_1500 cntTSD0_5000 
*House and lot attributes: summarize over nonmissing values only, using each
*	attribute's companion _miss dummy to exclude missing observations. 
foreach v in acres stories bathtot sqftstrc age {
	sum `v' if `v'_miss==0
	}
	
*number of TSDFs in each bin
foreach bin in 0_250 250_500 500_750 750_1000 {
	tab cntTSD`bin'
	}


********************************************************************************
********************************************************************************

*Sales counts by 250-meter TSD distance bin, out to 5000 meters.
	*Note: Decided to do this by creating data table and exporting to excel. 

*Create table of counts for Figure A3 in Appendix B.2 of final paper.

*Build the list of 250-meter distance-bin dummies once, in increasing distance
*	order: dTSD0_250 dTSD250_500 ... dTSD4750_5000 (20 bins).
local vars
forvalues lo=0(250)4750 {
	local hi=`lo'+250
	local vars `vars' dTSD`lo'_`hi'
	}
	
*Bring in transactions within 5km of any TSDF, reading only the bin dummies
*	to speed things up. 
use `vars' using "$salesfolder\All_Sales_Final_Cleaned_TSD5k", clear
count		

*Total each dummy over all sales. Stored as double because the default float
*	type represents integers exactly only up to 16,777,216, so larger counts
*	would be silently rounded. 
foreach v of local vars {
	egen double n_`v'=total(`v')
	}
*keep only first row and newly calculated tabulations
keep if _n==1
keep n_*
*Transpose so each bin is a row. -promote- keeps the double precision; 
*	without it the transposed values are stored as float. 
xpose, varname clear promote
order _varname, first
rename v1 sales_cnt
export excel using "$resultsfolder\TSD_SalesCnts_byBins", firstrow(variables) replace


********************************************************************************
********************************************************************************

*Create table of counts for Figure A4 in Appendix B.2 of final paper.

*Build the list of 250-meter bin dummies for each Corrective Action (CA) stage.
*	Order is all pre bins, then all mid bins, then all post bins, each in 
*	increasing distance order (e.g., dpreCA0_250 ... dpreCA4750_5000).
local vars
foreach stage in pre mid post {
	forvalues lo=0(250)4750 {
		local hi=`lo'+250
		local vars `vars' d`stage'CA`lo'_`hi'
		}
	}

*Bring in the CA 5km transactions dataset, reading only the bin dummies.
use `vars' using "$salesfolder\All_Sales_Final_Cleaned_CA5k", clear
count		

*Number of sales falling in more than one of the three innermost bins, by stage.
count if dpreCA0_250+dpreCA250_500+dpreCA500_750>1
count if dmidCA0_250+dmidCA250_500+dmidCA500_750>1
count if dpostCA0_250+dpostCA250_500+dpostCA500_750>1
	*Note: Discrepancy between sum of these individual 250-meter bin counts below
	*	versus subsequent pooled 0-750m bin counts is because some sales have,
	*	for example, a mid-CA site in 0-250m and one in 250-500m.  Those are 
	*	counted for each smaller bin here, but only once in subsequent pooled bin
	*	counts. 

*Sales counts for number of pre, mid, and post bins, within 5000-meters
	*Note: Decided to do this by creating data table and exporting to excel. 
*Stored as double because the default float type represents integers exactly
*	only up to 16,777,216, so larger counts would be silently rounded. 
foreach v of local vars {
	egen double n_`v'=total(`v')
	}
*keep only first row and newly calculated tabulations
keep if _n==1
keep n_*
*Transpose so each stage-bin is a row. -promote- keeps the double precision; 
*	without it the transposed values are stored as float. 
xpose, varname clear promote
order _varname, first
rename v1 sales_cnt
export excel using "$resultsfolder\CorrAction_SalesCnts_byBins", firstrow(variables) replace
		
		
********************************************************************************
********************************************************************************

*Create counts for Table A4 of Appendix B.3 of final paper. 

*Repeat above tabulation of sales number exercise, but do so for 0-750m bin.
*	Done here for all sales; the CEM-matched sample and the identifying repeat
*	sales are tabulated separately further below. 
local vars dpreCA0_750 dmidCA0_750 dpostCA0_750 
*Bring in the CA 5km transactions dataset, reading only the three stage dummies.
use `vars' using "$salesfolder\All_Sales_Final_Cleaned_CA5k", clear
*Total each dummy over all sales. Stored as double because the default float
*	type represents integers exactly only up to 16,777,216. 
foreach v of local vars {
	egen double n_`v'=total(`v')
	}
*keep only first row and newly calculated tabulations
keep if _n==1
keep n_*
*-promote- keeps double precision through the transpose (default is float).
xpose, varname clear promote
order _varname, first
rename v1 sales_cnt
export excel using "$resultsfolder\CorrAction_SalesCnts_0_750_binOnly", firstrow(variables) replace	


*now do same with CEM dataset
local vars dpreCA0_750 dmidCA0_750 dpostCA0_750 
*Read only the three 0-750m CA stage dummies from the CEM matched-only sample.
use `vars' using "$salesfolder\All_Sales_Final_Cleaned_CA1500m_CEM_MatchOnly", clear	
*Total each dummy over all sales in the sample. Stored as double because the
*	default float type represents integers exactly only up to 16,777,216. 
foreach v of local vars {
	egen double n_`v'=total(`v')
	}
*keep only first row and newly calculated tabulations
keep if _n==1
keep n_*
*-promote- keeps double precision through the transpose (default is float).
xpose, varname clear promote
order _varname, first
rename v1 sales_cnt
export excel using "$resultsfolder\CorrAction_SalesCnts_0_750_binOnly_CEMsample", firstrow(variables) replace	


*now do same with just identifying repeat sales
*Read only the parcel and transaction identifiers plus the three 0-750m CA 
*	stage dummies; nothing else is used below. 
use importparcelid transid dpreCA0_750 dmidCA0_750 dpostCA0_750 ///
	using "$salesfolder\All_Sales_Final_Cleaned_CA5k", clear
*Verify parcel-transaction pairs uniquely identify observations, and sort on
*	them so the by-parcel commands below can run. 
isid importparcelid transid, sort
*Drop parcels only sold once. Each row is one sale, so the by-group size _N is
*	the number of sales of that parcel. 
by importparcelid: drop if _N==1
*sum totals of CA stage variables to see how many repeat sales sold in more than
*	one stage. 
foreach stage in pre mid post {
	by importparcelid: egen sum_`stage'CA0_750=total(d`stage'CA0_750)
	}
*dummies denoting sales that provide identification with parcel FE (i.e., repeat
*	sales) model, meaning same home sold in both corresponding before and after
*	stages. 
gen dpre0_750_RepSale=(sum_preCA0_750>0 & sum_midCA0_750>0 & dpreCA0_750==1)
gen dmidpre0_750_RepSale=(sum_preCA0_750>0 & sum_midCA0_750>0 & dmidCA0_750==1)
gen dmidpost0_750_RepSale=(sum_midCA0_750>0 & sum_postCA0_750>0 & dmidCA0_750==1)
gen dpost0_750_RepSale=(sum_midCA0_750>0 & sum_postCA0_750>0 & dpostCA0_750==1)
*sum up to get observation counts
*	Stored as double because the default float type represents integers exactly
*	only up to 16,777,216. 
local vars dpreCA0_750 dmidCA0_750 dpostCA0_750 ///
	dpre0_750_RepSale dmidpre0_750_RepSale dmidpost0_750_RepSale dpost0_750_RepSale
foreach v of local vars {
	egen double n_`v'=total(`v')
	}
*keep only first row and newly calculated tabulations
keep if _n==1
keep n_*
*-promote- keeps double precision through the transpose (default is float).
xpose, varname clear promote
order _varname, first
rename v1 sales_cnt
export excel using "$resultsfolder\CorrAction_SalesCnts_0_750_binOnly_RepSales", firstrow(variables) replace

	
	
********************************************************************************
********************************************************************************

*Covariate balance across treatment and control groups, for Table A12 in 
*	Appendix F.1 of final paper. This part covers the unweighted sample of 
*	sales within 0-1500m of a Corrective Action (CA); the corresponding 
*	CEM-weighted comparison follows later in the file. 


*Setup for t-tests between near and far sales around Corrective Action sites
*Load the 0-1500m CA sample and report its size.
use "$salesfolder\All_Sales_Final_Cleaned_CA1500m", clear
count
*Treatment dummy: 1 when the sale is flagged in the 0-750m bin for any CA stage
*	(pre, mid, or post), 0 otherwise. 
	*NOTE(review): a missing value in any stage dummy makes the sum missing, 
	*	which Stata evaluates as >0 -- confirm the stage dummies are never missing.
gen dCA0_750=(dpreCA0_750+dmidCA0_750+dpostCA0_750>0)
tab dCA0_750

/*
*mean sale price of homes within 750m during (mid) CA.
sum rprice if dmidCA0_750==1, detail
sum rprice if dmidCA0_750==1 & tranyr==2018, detail
sort tranyr
by tranyr: sum rprice if dmidCA0_750==1
	*Note: Bit of an aside for illustrative policy calculation later. 
*/
	
*TSD count within 0-1500m, as the sum of the 0-750m and 750-1500m counts.
gen cntTSD0_1500=cntTSD0_750+cntTSD750_1500
*gen cntTSD0_5000=cntTSD0_2500+cntTSD2500_5000

*Part 1: house and lot attributes
*Open (and overwrite) the workbook that stores the results; write header row.
putexcel set "$resultsfolder\ttests_HouseMeans_CA0_750m_to_CA750_1500m.xlsx", replace
putexcel A1="Variable" B1="Ctrl Grp Mean" C1="Trt Grp Mean" D1="tstat" E1="p-val"
*Attributes tested; each has a companion _miss dummy used to restrict the test
*	to nonmissing values. 
local housevars acres stories bathtot sqftstrc age 
local nhouse: word count `housevars'
forvalues i=1/`nhouse' {
	local v: word `i' of `housevars'
	*results start in row 2, directly beneath the header row
	local row=`i'+1
	*unequal-variance t-test across the two dCA0_750 groups
	ttest `v' if `v'_miss==0, by(dCA0_750) unequal
	putexcel A`row'="`v'"
	putexcel B`row'=`r(mu_1)'
	putexcel C`row'=`r(mu_2)'
	putexcel D`row'=`r(t)'
	putexcel E`row'=`r(p)'
	}
*Part 2: price and location attributes
	*Note: Handled separately b/c these variables have no missing dummy, so the
	*	test runs without an if-restriction. lake500m and river250m are left out.
putexcel set "$resultsfolder\ttests_LocationMeans_CA0_750m_to_CA750_1500m.xlsx", replace
putexcel A1="Variable" B1="Ctrl Grp Mean" C1="Trt Grp Mean" D1="tstat" E1="p-val"
local locvars rprice p_nbdev_2011_200 p_nbdev_2011_500 hwy500m ///
	cntTSD0_1500 cntTSD0_5000
local nloc: word count `locvars'
forvalues i=1/`nloc' {
	local v: word `i' of `locvars'
	*results start in row 2, directly beneath the header row
	local row=`i'+1
	ttest `v', by(dCA0_750) unequal
	putexcel A`row'="`v'"
	putexcel B`row'=`r(mu_1)'
	putexcel C`row'=`r(mu_2)'
	putexcel D`row'=`r(t)'
	putexcel E`row'=`r(p)'
	}

	

*CEM-weighted comparison of sample means between treatment and control groups.
*	This is the second set of comparisons in Table A12 in Appendix F.1. 
use "$salesfolder\All_Sales_Final_Cleaned_CA1500m_CEM_MatchOnly", clear	
count
*Control group dummy, needed because the t-stats come from a regression 
*	framework (regressions accommodate the CEM weights). The treatment group
*	dummy dCA0_750 is already in this dataset. 
*First pass, for inspection only: flag any sale in the 750-1500m bin of any CA
*	stage and cross-tabulate against the treatment dummy. 
gen dCA750_1500=(dpreCA750_1500+dmidCA750_1500+dpostCA750_1500>0)
tab dCA0_750 dCA750_1500
	*Note: Because multiple TSDFs and CAs are sometimes in close proximity, the
	*	two groups overlap. The first-pass dummy is therefore discarded and 
	*	replaced by one named as a temp variable, so it is not accidentally
	*	used in regressions. The temp variable leaves a sale out of the control
	*	group whenever the treatment dummy is not 0. 
drop dCA750_1500
gen dCA750_1500tmp=(dpreCA750_1500+dmidCA750_1500+dpostCA750_1500>0 & dCA0_750==0)
tab dCA0_750 dCA750_1500tmp
*TSD count within 0-1500m, as the sum of the 0-750m and 750-1500m counts.
gen cntTSD0_1500=cntTSD0_750+cntTSD750_1500
*gen cntTSD0_5000=cntTSD0_2500+cntTSD2500_5000

*Compare the group within 750m of a TSD with CA to those farther out, using
*	the CEM weights. 
	*Note: Two regressions per variable. The first (no constant, both group 
	*	dummies) gives the group means as coefficients. The second (treatment
	*	dummy plus constant) gives the t-stat and p-value for whether the 
	*	difference is statistically significant. 
*Part 1: house attributes (restricted to nonmissing via the _miss dummies)
putexcel set "$resultsfolder\ttests_CEMSample_HomeMeans_CA0_750m_to_CA750_1500m.xlsx", replace
putexcel A1="Variable" B1="Ctrl Grp Mean" C1="Trt Grp Mean" D1="tstat" E1="p-val"
local housevars acres stories bathtot sqftstrc age
local nhouse: word count `housevars'
forvalues i=1/`nhouse' {
	local v: word `i' of `housevars'
	*results start in row 2, directly beneath the header row
	local row=`i'+1
	regress `v' dCA750_1500tmp dCA0_750  if `v'_miss==0 [aweight=cem_weights], noconst
	putexcel A`row'="`v'"
	*e(b) holds two coefficients (control dummy, then treatment dummy), so 
	*	writing it at column B fills columns B and C
	putexcel B`row'=matrix(e(b))
	regress `v' dCA0_750  if `v'_miss==0 [aweight=cem_weights]
	*rows 3 and 4 of r(table), first column: t-stat and p-value on dCA0_750
	putexcel D`row'=matrix(r(table)[3,1])
	putexcel E`row'=matrix(r(table)[4,1])
	}
*Part 2: location attributes
	*Note: Handled separately b/c location attributes have no missing dummy.
	*	lake500m and river250m are left out.
putexcel set "$resultsfolder\ttests_CEMSample_LocationMeans_CA0_750m_to_CA750_1500m.xlsx", replace
putexcel A1="Variable" B1="Ctrl Grp Mean" C1="Trt Grp Mean" D1="tstat" E1="p-val"
local locvars p_nbdev_2011_200 p_nbdev_2011_500 hwy500m ///
	cntTSD0_1500 cntTSD0_5000
local nloc: word count `locvars'
forvalues i=1/`nloc' {
	local v: word `i' of `locvars'
	*results start in row 2, directly beneath the header row
	local row=`i'+1
	regress `v' dCA750_1500tmp dCA0_750 [aweight=cem_weights], noconst
	putexcel A`row'="`v'"
	putexcel B`row'=matrix(e(b))
	regress `v' dCA0_750  [aweight=cem_weights]
	putexcel D`row'=matrix(r(table)[3,1])
	putexcel E`row'=matrix(r(table)[4,1])
	}

	
********************************************************************************	
********************************************************************************

*Covariate comparison before versus after treatment, for the treated group only.
*	Generates estimates for Table A15 in Appendix F.2 of final paper.


*Setup for t-tests comparing before and after sales in treated (0-750m) zone. 
*Load the 0-1500m CA sample and report its size.
use "$salesfolder\All_Sales_Final_Cleaned_CA1500m", clear
count
*Treatment dummy: 1 when the sale is flagged in the 0-750m bin for any CA stage
*	(pre, mid, or post), 0 otherwise. 
gen dCA0_750=(dpreCA0_750+dmidCA0_750+dpostCA0_750>0)
tab dCA0_750
*Restrict the data in memory to the treated group.
keep if dCA0_750==1

*TSD count within 0-1500m, as the sum of the 0-750m and 750-1500m counts.
gen cntTSD0_1500=cntTSD0_750+cntTSD750_1500
*gen cntTSD0_5000=cntTSD0_2500+cntTSD2500_5000


*Before versus After CA Opening (pre-CA versus mid-CA sales)
	*Note: Every test below is limited to sales flagged as pre-CA or mid-CA in
	*	the 0-750m bin, because only want to compare covariates across the 
	*	first treatment event here. Groups are split by the mid-CA dummy. 
*Part 1: house and lot attributes (restricted to nonmissing via _miss dummies)
putexcel set "$resultsfolder\ttests_HouseMeans_CA0_750m_Pre_to_MidCA.xlsx", replace
putexcel A1="Variable" B1="PreCA Grp Mean" C1="MidCA Grp Mean" D1="tstat" E1="p-val"
local housevars acres stories bathtot sqftstrc age 
local nhouse: word count `housevars'
forvalues i=1/`nhouse' {
	local v: word `i' of `housevars'
	*results start in row 2, directly beneath the header row
	local row=`i'+1
	ttest `v' if `v'_miss==0 & (dpreCA0_750+dmidCA0_750>0), by(dmidCA0_750) unequal
	putexcel A`row'="`v'"
	putexcel B`row'=`r(mu_1)'
	putexcel C`row'=`r(mu_2)'
	putexcel D`row'=`r(t)'
	putexcel E`row'=`r(p)'
	}
*Part 2: price and location attributes
	*Note: Handled separately b/c these variables have no missing dummy.
	*	lake500m and river250m are left out.
putexcel set "$resultsfolder\ttests_LocationMeans_CA0_750m_Pre_to_MidCA.xlsx", replace
putexcel A1="Variable" B1="PreCA Grp Mean" C1="MidCA Grp Mean" D1="tstat" E1="p-val"
local locvars rprice p_nbdev_2011_200 p_nbdev_2011_500 hwy500m ///
	cntTSD0_1500 cntTSD0_5000
local nloc: word count `locvars'
forvalues i=1/`nloc' {
	local v: word `i' of `locvars'
	*results start in row 2, directly beneath the header row
	local row=`i'+1
	ttest `v' if (dpreCA0_750+dmidCA0_750>0), by(dmidCA0_750) unequal
	putexcel A`row'="`v'"
	putexcel B`row'=`r(mu_1)'
	putexcel C`row'=`r(mu_2)'
	putexcel D`row'=`r(t)'
	putexcel E`row'=`r(p)'
	}
	
	
*Before versus After CA Completion (mid-CA versus post-CA sales)
	*Note: Every test below is limited to sales flagged as mid-CA or post-CA in
	*	the 0-750m bin, because only want to compare covariates across the 
	*	mid and post stages here. Groups are split by the post-CA dummy. 
*Part 1: house and lot attributes (restricted to nonmissing via _miss dummies)
putexcel set "$resultsfolder\ttests_HouseMeans_CA0_750m_Mid_to_PostCA.xlsx", replace
putexcel A1="Variable" B1="MidCA Grp Mean" C1="PostCA Grp Mean" D1="tstat" E1="p-val"
local housevars acres stories bathtot sqftstrc age 
local nhouse: word count `housevars'
forvalues i=1/`nhouse' {
	local v: word `i' of `housevars'
	*results start in row 2, directly beneath the header row
	local row=`i'+1
	ttest `v' if `v'_miss==0 & (dmidCA0_750+dpostCA0_750>0), by(dpostCA0_750) unequal
	putexcel A`row'="`v'"
	putexcel B`row'=`r(mu_1)'
	putexcel C`row'=`r(mu_2)'
	putexcel D`row'=`r(t)'
	putexcel E`row'=`r(p)'
	}
*Part 2: price and location attributes
	*Note: Handled separately b/c these variables have no missing dummy.
	*	lake500m and river250m are left out.
putexcel set "$resultsfolder\ttests_LocationMeans_CA0_750m_Mid_to_PostCA.xlsx", replace
putexcel A1="Variable" B1="MidCA Grp Mean" C1="PostCA Grp Mean" D1="tstat" E1="p-val"
local locvars rprice p_nbdev_2011_200 p_nbdev_2011_500 hwy500m ///
	cntTSD0_1500 cntTSD0_5000
local nloc: word count `locvars'
forvalues i=1/`nloc' {
	local v: word `i' of `locvars'
	*results start in row 2, directly beneath the header row
	local row=`i'+1
	ttest `v' if (dmidCA0_750+dpostCA0_750>0), by(dpostCA0_750) unequal
	putexcel A`row'="`v'"
	putexcel B`row'=`r(mu_1)'
	putexcel C`row'=`r(mu_2)'
	putexcel D`row'=`r(t)'
	putexcel E`row'=`r(p)'
	}

	
*END






 